{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### get raw data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set workspace to Windrecorder dir\n",
    "import os\n",
    "import io\n",
    "os.chdir(\"..\")\n",
    "os.chdir(\"..\")\n",
    "# ------------------------------------------------------------\n",
    "\n",
    "import datetime\n",
    "import pandas as pd\n",
    "\n",
    "from windrecorder.db_manager import db_manager\n",
    "from windrecorder.oneday import OneDay\n",
    "from windrecorder.record_wintitle import get_wintitle_stat_in_day\n",
    "from windrecorder.config import config\n",
    "from windrecorder import llm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# datetime_select = datetime.datetime.today()\n",
    "datetime_select = datetime.datetime(2024,8,19)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_day_search_result = OneDay().search_day_data(\n",
    "                    datetime_select,\n",
    "                    search_content=\"\",\n",
    "                )\n",
    "\n",
    "df_day_search_result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_day_activity_raw, day_time_sum = get_wintitle_stat_in_day(datetime_select, optimize_for_display=False)\n",
    "df_day_activity_optimize_display, _ = get_wintitle_stat_in_day(datetime_select, optimize_for_display=True)\n",
    "\n",
    "df_day_activity_optimize_display"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### clean and format data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# convert day records datetime format\n",
    "df_day_search_result['videofile_time'] = pd.to_datetime(df_day_search_result['videofile_time'], unit='s').dt.strftime('%Y-%m-%d %H-%M-%S')\n",
    "\n",
    "# config\n",
    "day_records_clean_df = pd.DataFrame(columns=['datetime', 'activity', 'ocr_content'])\n",
    "activity_compare_lst = []\n",
    "activity_always_exclude_lst = [\"explorer.exe\"]\n",
    "vaild_day_activity_lst = []\n",
    "max_token_limit = 1000000/2.5\n",
    "max_pre_row_token = int(max_token_limit/len(df_day_search_result))\n",
    "activity_deduplication_trace_depth = 10\n",
    "MIN_ACTIVITY_DURATION_SECOND = 30\n",
    "MAX_DAY_ACTIVITY_HEAD_TRUNCATION_FOR_LLM = 20\n",
    "\n",
    "# Only keep long-lasting activities\n",
    "for index, row in df_day_activity_raw.iterrows():\n",
    "    if row['Screen Time'] < MIN_ACTIVITY_DURATION_SECOND:\n",
    "        continue\n",
    "    vaild_day_activity_lst.append(row['Page'])\n",
    "\n",
    "# filtering day records\n",
    "for index, row in df_day_search_result.iterrows():\n",
    "    activity = row['win_title']\n",
    "    \n",
    "    # 剔除过近的重复项、活动时间过短的项\n",
    "    if activity in activity_compare_lst[-activity_deduplication_trace_depth:] or activity in activity_always_exclude_lst or activity not in vaild_day_activity_lst or len(activity.split(\" | \")[0]) == 0:\n",
    "        continue\n",
    "    activity_compare_lst.append(activity)\n",
    "    ocr_text = row['ocr_text']\n",
    "    \n",
    "    row = {\n",
    "        'datetime': row['videofile_time'],\n",
    "        'activity': activity.split(\" | \")[0],\n",
    "        'ocr_content': row['ocr_text'][:max_pre_row_token],\n",
    "    }\n",
    "    day_records_clean_df.loc[len(day_records_clean_df )] = row\n",
    "\n",
    "df_day_activity_optimize_display = df_day_activity_optimize_display.head(MAX_DAY_ACTIVITY_HEAD_TRUNCATION_FOR_LLM)\n",
    "\n",
    "day_records_clean_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_day_activity_optimize_display"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### generate prompt context"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def convert_df_to_csv_str(df:pd.DataFrame):\n",
    "    csv_buffer = io.StringIO()\n",
    "    df.to_csv(csv_buffer, index=False)\n",
    "    csv_string = csv_buffer.getvalue()\n",
    "    return csv_string\n",
    "\n",
    "prompt_day_wintitle_activity = convert_df_to_csv_str(df_day_activity_optimize_display)\n",
    "prompt_day_records = convert_df_to_csv_str(day_records_clean_df)\n",
    "\n",
    "prompt_mount = f\"\"\"\n",
    "### Top {MAX_DAY_ACTIVITY_HEAD_TRUNCATION_FOR_LLM} activities by user screen time for the day, sorted by time, highest to lowest (CSV format):\n",
    "---Start of screen time ranking csv---\n",
    "{prompt_day_wintitle_activity}\n",
    "---END of screen time ranking csv---\n",
    "\n",
    "### Part of the user's specific activities on that day, sorted by time (CSV format) (ocr_content contains a lot of noise, please refer to it with very low weight and only focus on the key content information related to the activity title. Pay more attention to activity and only use ocr_content as an optional supplement.):\n",
    "---Start of activities csv---\n",
    "{prompt_day_records}\n",
    "---END of activities csv---\n",
    "\"\"\"\n",
    "\n",
    "# prompt_mount = f\"\"\"\n",
    "# ### Top {MAX_DAY_ACTIVITY_HEAD_TRUNCATION_FOR_LLM} activities by user screen time for the day, sorted by time, highest to lowest (CSV format):\n",
    "# ---Start of screen time ranking csv---\n",
    "# {prompt_day_wintitle_activity}\n",
    "# ---END of screen time ranking csv---\n",
    "# \"\"\"\n",
    "\n",
    "system_prompt = f\"\"\"\n",
    "You are a user behavior analysis expert. Please generate a simple summary of the user's daily activities on that day based on the following computer activity records (screen time ranking, specific activity tracks) of the user ({config.user_name}) received on that day, describing what the user did and what content browsed on that day. If necessary, please retain the proper nouns or origin text of the activity content.\n",
    "\"\"\"\n",
    "prompt_end = \"\"\"\n",
    "Now, please summarize the user's daily activities in chronological order based on the above. Keep it concise, clear, simple, and insightful. Please output the results directly without adding additional instructions.\n",
    "\"\"\"\n",
    "\n",
    "messages = [\n",
    "    {\"role\": \"system\", \"content\": system_prompt},\n",
    "    {\"role\": \"user\", \"content\": prompt_mount + prompt_end}\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# from openai import OpenAI\n",
    "\n",
    "# # Point to the local server\n",
    "# client = OpenAI(base_url=\"http://localhost:1234/v1\", api_key=\"lm-studio\")\n",
    "\n",
    "# completion = client.chat.completions.create(\n",
    "#   model=\"PrunaAI/Phi-3-mini-128k-instruct-GGUF-Imatrix-smashed\",\n",
    "#   messages=messages,\n",
    "#   temperature=0.7,\n",
    "# )\n",
    "\n",
    "# print(completion.choices[0].message)\n",
    "\n",
    "success, plain_text = llm.request_llm_one_shot(\n",
    "    user_content=prompt_mount + prompt_end,\n",
    "    system_prompt=system_prompt,\n",
    "    temperature=.3,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(plain_text)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
